Formulae : AIC for a Regression problem: \[AIC = 2k + n*Log(RSS/n)\]
RSS - Residual Sum of Squares
n - Number of training examples
k - Number of attributes/independent variables/ predictor variables/ features
rm(list=ls())
df <- read.delim("./UnivBank.csv",sep = ",",na.strings = c(""," ","?","#"))
str(df)
## 'data.frame': 5000 obs. of 14 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 25 45 39 35 35 37 53 50 35 34 ...
## $ Experience : int 1 19 15 9 8 13 27 24 10 9 ...
## $ Income : int 49 34 11 100 45 29 72 22 81 180 ...
## $ ZIP.Code : int 91107 90089 94720 94112 91330 92121 91711 93943 90089 93023 ...
## $ Family : int 4 3 1 1 4 4 2 1 3 1 ...
## $ CCAvg : num 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ Education : int 1 1 1 2 2 2 2 3 2 3 ...
## $ Mortgage : int 0 0 0 0 0 155 0 0 104 0 ...
## $ Personal.Loan : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Securities.Account: int 1 1 0 0 0 0 0 0 0 0 ...
## $ CD.Account : int 0 0 0 NA 0 0 0 0 0 0 ...
## $ Online : int 0 0 0 0 0 1 1 0 1 0 ...
## $ CreditCard : int 0 0 0 0 1 0 0 1 0 0 ...
summary(df)
## ID Age Experience Income
## Min. : 1 Min. :23.00 Min. :-3.0 Min. : 8.00
## 1st Qu.:1251 1st Qu.:35.00 1st Qu.:10.0 1st Qu.: 39.00
## Median :2500 Median :45.00 Median :20.0 Median : 64.00
## Mean :2500 Mean :45.34 Mean :20.1 Mean : 73.77
## 3rd Qu.:3750 3rd Qu.:55.00 3rd Qu.:30.0 3rd Qu.: 98.00
## Max. :5000 Max. :67.00 Max. :43.0 Max. :224.00
##
## ZIP.Code Family CCAvg Education
## Min. : 9307 Min. :1.000 Min. : 0.000 Min. :1.000
## 1st Qu.:91911 1st Qu.:1.000 1st Qu.: 0.700 1st Qu.:1.000
## Median :93437 Median :2.000 Median : 1.500 Median :2.000
## Mean :93152 Mean :2.396 Mean : 1.938 Mean :1.881
## 3rd Qu.:94608 3rd Qu.:3.000 3rd Qu.: 2.500 3rd Qu.:3.000
## Max. :96651 Max. :4.000 Max. :10.000 Max. :3.000
##
## Mortgage Personal.Loan Securities.Account CD.Account
## Min. : 0.00 Min. :0.000 Min. :0.0000 Min. :0.00000
## 1st Qu.: 0.00 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 0.00 Median :0.000 Median :0.0000 Median :0.00000
## Mean : 56.52 Mean :0.096 Mean :0.1044 Mean :0.06041
## 3rd Qu.:101.00 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :635.00 Max. :1.000 Max. :1.0000 Max. :1.00000
## NA's :2 NA's :2 NA's :1
## Online CreditCard
## Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.000
## Median :1.0000 Median :0.000
## Mean :0.5968 Mean :0.294
## 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.000
##
sum(is.na(df))
## [1] 5
# dropping Zip
df$ZIP.Code = NULL
df$ID = NULL
# Experience has negative values, change them to 0
df$Experience[df$Experience < 0] <- 0
cat_cols = c("Family","CreditCard","CD.Account","Online","Education","Personal.Loan","Securities.Account")
num_cols = setdiff(colnames(df),cat_cols)
cat_cols
## [1] "Family" "CreditCard" "CD.Account"
## [4] "Online" "Education" "Personal.Loan"
## [7] "Securities.Account"
num_cols
## [1] "Age" "Experience" "Income" "CCAvg" "Mortgage"
summary(df)
## Age Experience Income Family
## Min. :23.00 Min. : 0.00 Min. : 8.00 Min. :1.000
## 1st Qu.:35.00 1st Qu.:10.00 1st Qu.: 39.00 1st Qu.:1.000
## Median :45.00 Median :20.00 Median : 64.00 Median :2.000
## Mean :45.34 Mean :20.12 Mean : 73.77 Mean :2.396
## 3rd Qu.:55.00 3rd Qu.:30.00 3rd Qu.: 98.00 3rd Qu.:3.000
## Max. :67.00 Max. :43.00 Max. :224.00 Max. :4.000
##
## CCAvg Education Mortgage Personal.Loan
## Min. : 0.000 Min. :1.000 Min. : 0.00 Min. :0.000
## 1st Qu.: 0.700 1st Qu.:1.000 1st Qu.: 0.00 1st Qu.:0.000
## Median : 1.500 Median :2.000 Median : 0.00 Median :0.000
## Mean : 1.938 Mean :1.881 Mean : 56.52 Mean :0.096
## 3rd Qu.: 2.500 3rd Qu.:3.000 3rd Qu.:101.00 3rd Qu.:0.000
## Max. :10.000 Max. :3.000 Max. :635.00 Max. :1.000
## NA's :2
## Securities.Account CD.Account Online CreditCard
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :0.00000 Median :1.0000 Median :0.000
## Mean :0.1044 Mean :0.06041 Mean :0.5968 Mean :0.294
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.000
## NA's :2 NA's :1
df[,cat_cols] = data.frame(apply(df[,cat_cols],2,as.factor))
for (i in cat_cols){
if (i != "ZIP.Code")barplot(table(df[,i]),col = "brown",main = paste("Distribution of ",i))
}
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
ggplot(df, aes(x=Online, y=Income)) +
geom_boxplot(aes(fill=Online))
set.seed(007) # set seed for reproducible results
library(caret)
## Loading required package: lattice
train_rows <- createDataPartition(df$Income, p = .8,
list = FALSE,
times = 1)
train_data <- df[train_rows, ]
test_data <- df[-train_rows, ]
library(corrplot)
## corrplot 0.84 loaded
corrplot(cor(train_data[num_cols],use="complete.obs"),title = "Correlation plot for train data")
corrplot(cor(test_data[num_cols],use="complete.obs"),title = "Correlation plot for test data")
library(RANN)
library(DMwR)
## Loading required package: grid
impute_num = preProcess(x = train_data[, !colnames(train_data) %in% c("Income")], method = c("knnImpute"))
train_data = predict(impute_num, train_data)
test_data = predict(impute_num, test_data)
for (x in cat_cols){
# print (names(which(table(train_data[,x]) == max(table(train_data[,x])))))
subs = names(which(table(train_data[,x]) == max(table(train_data[,x]))))
train_data[,x][is.na(train_data[,x])] = subs
test_data[,x][is.na(test_data[,x])] = subs
}
sum(is.na(train_data))
## [1] 0
std_obj <- preProcess(x = train_data[, !colnames(train_data) %in% c("Income")],
method = c("center", "scale"))
train_std_data <- predict(std_obj, train_data)
test_std_data <- predict(std_obj, test_data)
str(train_std_data)
## 'data.frame': 4002 obs. of 12 variables:
## $ Age : num -0.542 -0.892 -0.892 -0.717 0.681 ...
## $ Experience : num -0.436 -0.962 -1.05 -0.612 0.615 ...
## $ Income : int 11 100 45 29 72 81 45 114 40 112 ...
## $ Family : Factor w/ 4 levels "1","2","3","4": 1 1 4 4 2 3 3 2 4 1 ...
## $ CCAvg : num -0.539 0.419 -0.539 -0.877 -0.257 ...
## $ Education : Factor w/ 3 levels "1","2","3": 1 2 2 2 2 2 2 3 2 1 ...
## $ Mortgage : num -0.557 -0.557 -0.557 0.976 -0.557 ...
## $ Personal.Loan : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Securities.Account: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 2 ...
## $ CD.Account : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Online : Factor w/ 2 levels "0","1": 1 1 1 2 2 2 2 1 2 1 ...
## $ CreditCard : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 1 ...
dummy_obj <- dummyVars( ~ . , train_std_data)
train_dummy_data <- as.data.frame(predict(dummy_obj, train_std_data))
test_dummy_data <- as.data.frame(predict(dummy_obj, test_std_data))
model_basic <- lm(formula = Income~. , data = train_dummy_data)
summary(model_basic)
##
## Call:
## lm(formula = Income ~ ., data = train_dummy_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -92.875 -22.530 -5.033 19.322 137.069
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 108.5566 2.6559 40.874 < 2e-16 ***
## Age -12.1438 4.6377 -2.618 0.00887 **
## Experience 10.7402 4.6329 2.318 0.02049 *
## Family.1 9.8155 1.3412 7.319 3.02e-13 ***
## Family.2 12.4143 1.3836 8.973 < 2e-16 ***
## Family.3 2.6667 1.4505 1.838 0.06607 .
## Family.4 NA NA NA NA
## CCAvg 21.6614 0.5402 40.097 < 2e-16 ***
## Education.1 15.4095 1.2340 12.487 < 2e-16 ***
## Education.2 -0.6444 1.2819 -0.503 0.61523
## Education.3 NA NA NA NA
## Mortgage 4.0732 0.4892 8.327 < 2e-16 ***
## Personal.Loan.0 -56.0914 1.9142 -29.303 < 2e-16 ***
## Personal.Loan.1 NA NA NA NA
## Securities.Account.0 1.4643 1.6957 0.864 0.38789
## Securities.Account.1 NA NA NA NA
## CD.Account.0 2.5523 2.4119 1.058 0.29002
## CD.Account.1 NA NA NA NA
## Online.0 -1.3242 1.0039 -1.319 0.18722
## Online.1 NA NA NA NA
## CreditCard.0 -0.1492 1.1123 -0.134 0.89332
## CreditCard.1 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.42 on 3987 degrees of freedom
## Multiple R-squared: 0.5669, Adjusted R-squared: 0.5654
## F-statistic: 372.8 on 14 and 3987 DF, p-value: < 2.2e-16
# par(mfrow = c(2,2))
plot(model_basic)
library(DMwR)
preds_model <- predict(model_basic, test_dummy_data[, !(names(test_dummy_data) %in% c("Income"))])
# sum(is.na(train_dummy_data))
preds_train = predict(model_basic, train_dummy_data[, !(names(train_dummy_data) %in% c("Income"))])
regr.eval(trues =train_dummy_data$Income,preds = preds_train)
## mae mse rmse mape
## 24.23469 921.74599 30.36027 0.53903
regr.eval(trues =test_dummy_data$Income,preds = preds_model)
## mae mse rmse mape
## 25.1945052 995.3968233 31.5499100 0.5710639
library(MASS)
model_aic <- stepAIC(model_basic, direction = "both")
## Start: AIC=27348.73
## Income ~ Age + Experience + Family.1 + Family.2 + Family.3 +
## Family.4 + CCAvg + Education.1 + Education.2 + Education.3 +
## Mortgage + Personal.Loan.0 + Personal.Loan.1 + Securities.Account.0 +
## Securities.Account.1 + CD.Account.0 + CD.Account.1 + Online.0 +
## Online.1 + CreditCard.0 + CreditCard.1
##
##
## Step: AIC=27348.73
## Income ~ Age + Experience + Family.1 + Family.2 + Family.3 +
## Family.4 + CCAvg + Education.1 + Education.2 + Education.3 +
## Mortgage + Personal.Loan.0 + Personal.Loan.1 + Securities.Account.0 +
## Securities.Account.1 + CD.Account.0 + CD.Account.1 + Online.0 +
## Online.1 + CreditCard.0
##
##
## Step: AIC=27348.73
## Income ~ Age + Experience + Family.1 + Family.2 + Family.3 +
## Family.4 + CCAvg + Education.1 + Education.2 + Education.3 +
## Mortgage + Personal.Loan.0 + Personal.Loan.1 + Securities.Account.0 +
## Securities.Account.1 + CD.Account.0 + CD.Account.1 + Online.0 +
## CreditCard.0
##
##
## Step: AIC=27348.73
## Income ~ Age + Experience + Family.1 + Family.2 + Family.3 +
## Family.4 + CCAvg + Education.1 + Education.2 + Education.3 +
## Mortgage + Personal.Loan.0 + Personal.Loan.1 + Securities.Account.0 +
## Securities.Account.1 + CD.Account.0 + Online.0 + CreditCard.0
##
##
## Step: AIC=27348.73
## Income ~ Age + Experience + Family.1 + Family.2 + Family.3 +
## Family.4 + CCAvg + Education.1 + Education.2 + Education.3 +
## Mortgage + Personal.Loan.0 + Personal.Loan.1 + Securities.Account.0 +
## CD.Account.0 + Online.0 + CreditCard.0
##
##
## Step: AIC=27348.73
## Income ~ Age + Experience + Family.1 + Family.2 + Family.3 +
## Family.4 + CCAvg + Education.1 + Education.2 + Education.3 +
## Mortgage + Personal.Loan.0 + Securities.Account.0 + CD.Account.0 +
## Online.0 + CreditCard.0
##
##
## Step: AIC=27348.73
## Income ~ Age + Experience + Family.1 + Family.2 + Family.3 +
## Family.4 + CCAvg + Education.1 + Education.2 + Mortgage +
## Personal.Loan.0 + Securities.Account.0 + CD.Account.0 + Online.0 +
## CreditCard.0
##
##
## Step: AIC=27348.73
## Income ~ Age + Experience + Family.1 + Family.2 + Family.3 +
## CCAvg + Education.1 + Education.2 + Mortgage + Personal.Loan.0 +
## Securities.Account.0 + CD.Account.0 + Online.0 + CreditCard.0
##
## Df Sum of Sq RSS AIC
## - CreditCard.0 1 17 3688844 27347
## - Education.2 1 234 3689061 27347
## - Securities.Account.0 1 690 3689517 27348
## - CD.Account.0 1 1036 3689864 27348
## - Online.0 1 1610 3690437 27348
## <none> 3688827 27349
## - Family.3 1 3127 3691955 27350
## - Experience 1 4972 3693800 27352
## - Age 1 6344 3695171 27354
## - Family.1 1 49555 3738383 27400
## - Mortgage 1 64155 3752982 27416
## - Family.2 1 74487 3763315 27427
## - Education.1 1 144270 3833097 27500
## - Personal.Loan.0 1 794427 4483255 28127
## - CCAvg 1 1487524 5176352 28703
##
## Step: AIC=27346.75
## Income ~ Age + Experience + Family.1 + Family.2 + Family.3 +
## CCAvg + Education.1 + Education.2 + Mortgage + Personal.Loan.0 +
## Securities.Account.0 + CD.Account.0 + Online.0
##
## Df Sum of Sq RSS AIC
## - Education.2 1 234 3689078 27345
## - Securities.Account.0 1 729 3689573 27346
## - CD.Account.0 1 1066 3689910 27346
## - Online.0 1 1596 3690440 27346
## <none> 3688844 27347
## - Family.3 1 3118 3691962 27348
## + CreditCard.0 1 17 3688827 27349
## + CreditCard.1 1 17 3688827 27349
## - Experience 1 4968 3693812 27350
## - Age 1 6339 3695183 27352
## - Family.1 1 49539 3738383 27398
## - Mortgage 1 64157 3753001 27414
## - Family.2 1 74535 3763379 27425
## - Education.1 1 144276 3833120 27498
## - Personal.Loan.0 1 799297 4488141 28130
## - CCAvg 1 1487902 5176746 28701
##
## Step: AIC=27345
## Income ~ Age + Experience + Family.1 + Family.2 + Family.3 +
## CCAvg + Education.1 + Mortgage + Personal.Loan.0 + Securities.Account.0 +
## CD.Account.0 + Online.0
##
## Df Sum of Sq RSS AIC
## - Securities.Account.0 1 738 3689817 27344
## - CD.Account.0 1 1053 3690131 27344
## - Online.0 1 1559 3690637 27345
## <none> 3689078 27345
## - Family.3 1 3060 3692138 27346
## + Education.2 1 234 3688844 27347
## + Education.3 1 234 3688844 27347
## + CreditCard.0 1 17 3689061 27347
## + CreditCard.1 1 17 3689061 27347
## - Experience 1 4772 3693851 27348
## - Age 1 6127 3695205 27350
## - Family.1 1 50361 3739440 27397
## - Mortgage 1 64166 3753244 27412
## - Family.2 1 75279 3764358 27424
## - Education.1 1 207991 3897069 27562
## - Personal.Loan.0 1 799863 4488941 28128
## - CCAvg 1 1487692 5176771 28699
##
## Step: AIC=27343.8
## Income ~ Age + Experience + Family.1 + Family.2 + Family.3 +
## CCAvg + Education.1 + Mortgage + Personal.Loan.0 + CD.Account.0 +
## Online.0
##
## Df Sum of Sq RSS AIC
## - Online.0 1 1700 3691517 27344
## <none> 3689817 27344
## - CD.Account.0 1 2011 3691828 27344
## + Securities.Account.0 1 738 3689078 27345
## + Securities.Account.1 1 738 3689078 27345
## - Family.3 1 3120 3692937 27345
## + Education.2 1 243 3689573 27346
## + Education.3 1 243 3689573 27346
## + CreditCard.0 1 57 3689759 27346
## + CreditCard.1 1 57 3689759 27346
## - Experience 1 4809 3694626 27347
## - Age 1 6169 3695985 27348
## - Family.1 1 50600 3740417 27396
## - Mortgage 1 64596 3754413 27411
## - Family.2 1 75626 3765443 27423
## - Education.1 1 208145 3897962 27561
## - Personal.Loan.0 1 810055 4499872 28136
## - CCAvg 1 1487141 5176958 28697
##
## Step: AIC=27343.65
## Income ~ Age + Experience + Family.1 + Family.2 + Family.3 +
## CCAvg + Education.1 + Mortgage + Personal.Loan.0 + CD.Account.0
##
## Df Sum of Sq RSS AIC
## - CD.Account.0 1 1437 3692955 27343
## <none> 3691517 27344
## + Online.0 1 1700 3689817 27344
## + Online.1 1 1700 3689817 27344
## + Securities.Account.0 1 880 3690637 27345
## + Securities.Account.1 1 880 3690637 27345
## - Family.3 1 3048 3694565 27345
## + Education.2 1 204 3691313 27345
## + Education.3 1 204 3691313 27345
## + CreditCard.0 1 30 3691487 27346
## + CreditCard.1 1 30 3691487 27346
## - Experience 1 4883 3696400 27347
## - Age 1 6236 3697753 27348
## - Family.1 1 50264 3741781 27396
## - Mortgage 1 64218 3755735 27411
## - Family.2 1 74982 3766500 27422
## - Education.1 1 208048 3899565 27561
## - Personal.Loan.0 1 808355 4499872 28134
## - CCAvg 1 1487278 5178795 28696
##
## Step: AIC=27343.21
## Income ~ Age + Experience + Family.1 + Family.2 + Family.3 +
## CCAvg + Education.1 + Mortgage + Personal.Loan.0
##
## Df Sum of Sq RSS AIC
## <none> 3692955 27343
## + Securities.Account.0 1 1678 3691276 27343
## + Securities.Account.1 1 1678 3691276 27343
## + CD.Account.1 1 1437 3691517 27344
## + CD.Account.0 1 1437 3691517 27344
## + Online.0 1 1127 3691828 27344
## + Online.1 1 1127 3691828 27344
## - Family.3 1 2934 3695889 27344
## + Education.2 1 199 3692756 27345
## + Education.3 1 199 3692756 27345
## + CreditCard.0 1 38 3692917 27345
## + CreditCard.1 1 38 3692917 27345
## - Experience 1 4860 3697814 27346
## - Age 1 6210 3699165 27348
## - Family.1 1 50026 3742981 27395
## - Mortgage 1 63535 3756490 27410
## - Family.2 1 74890 3767845 27422
## - Education.1 1 207290 3900244 27560
## - Personal.Loan.0 1 855098 4548053 28175
## - CCAvg 1 1487253 5180208 28696
summary(model_aic)
##
## Call:
## lm(formula = Income ~ Age + Experience + Family.1 + Family.2 +
## Family.3 + CCAvg + Education.1 + Mortgage + Personal.Loan.0,
## data = train_dummy_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -91.463 -22.411 -5.077 18.999 136.453
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 110.7659 1.8273 60.616 < 2e-16 ***
## Age -11.9067 4.5954 -2.591 0.0096 **
## Experience 10.5265 4.5928 2.292 0.0220 *
## Family.1 9.8260 1.3362 7.354 2.33e-13 ***
## Family.2 12.4162 1.3800 8.997 < 2e-16 ***
## Family.3 2.5786 1.4480 1.781 0.0750 .
## CCAvg 21.6531 0.5400 40.096 < 2e-16 ***
## Education.1 15.7043 1.0491 14.969 < 2e-16 ***
## Mortgage 4.0455 0.4882 8.287 < 2e-16 ***
## Personal.Loan.0 -55.4602 1.8242 -30.403 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.42 on 3992 degrees of freedom
## Multiple R-squared: 0.5665, Adjusted R-squared: 0.5655
## F-statistic: 579.5 on 9 and 3992 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(model_aic)
preds_train = predict(model_aic, train_dummy_data[, !(names(train_dummy_data) %in% c("Income"))])
regr.eval(trues =train_dummy_data$Income,preds = preds_train)
## mae mse rmse mape
## 24.2465115 922.7772825 30.3772494 0.5396908
preds_model <- predict(model_aic, test_dummy_data[, !(names(test_dummy_data) %in% c("Income"))])
regr.eval(trues =test_dummy_data$Income,preds = preds_model)
## mae mse rmse mape
## 25.1975302 996.4818208 31.5671003 0.5710043
RIDGE - \[RSS(\beta) + \lambda \sum_{j=1}^{p} \beta_j^2\]
LASSO - \[RSS(\beta) + \lambda \sum_{j=1}^{p} |\beta_j|\]
Elastic Net - \[ RSS (\beta) + \lambda (\alpha (\sum_{j=1}^{p} |\beta_j|) + (1-\alpha)(\sum_{j=1}^{p} \beta_j^2))\]
p - number of attributes
X_train <- as.matrix(train_dummy_data[, -3])
y_train <- as.matrix(train_dummy_data[, 3])
X_test <- as.matrix(test_dummy_data[, -3])
y_test <- as.matrix(test_dummy_data[, 3])
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
lasso_model <- glmnet(X_train, y_train, lambda = 0.5, alpha = 1)
coef(lasso_model)
## 22 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 1.181760e+02
## Age -9.420483e-01
## Experience .
## Family.1 .
## Family.2 2.500672e+00
## Family.3 -5.798457e+00
## Family.4 -8.333367e+00
## CCAvg 2.153744e+01
## Education.1 1.540275e+01
## Education.2 .
## Education.3 .
## Mortgage 3.676867e+00
## Personal.Loan.0 -5.384254e+01
## Personal.Loan.1 .
## Securities.Account.0 5.121509e-01
## Securities.Account.1 -1.570836e-11
## CD.Account.0 .
## CD.Account.1 .
## Online.0 -3.201038e-02
## Online.1 .
## CreditCard.0 .
## CreditCard.1 .
preds_train = predict(lasso_model, X_train)
regr.eval(trues =train_dummy_data$Income,preds = preds_train)
## mae mse rmse mape
## 24.3088720 925.4101221 30.4205543 0.5432273
preds_lasso <- predict(lasso_model, X_test)
regr.eval(preds_lasso,test_dummy_data)
## mae mse rmse mape
## 1541.09419 131665.55505 362.85749 21.28068
ridge_model <- glmnet(X_train, y_train, lambda = 0.5, alpha = 0)
coef(ridge_model)
## 22 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 95.27646193
## Age -4.46028745
## Experience 3.06375194
## Family.1 3.60075321
## Family.2 6.18847943
## Family.3 -3.71320864
## Family.4 -6.25291153
## CCAvg 21.39328021
## Education.1 10.58453768
## Education.2 -5.70000717
## Education.3 -5.33333477
## Mortgage 4.04879416
## Personal.Loan.0 -30.27179285
## Personal.Loan.1 25.92683125
## Securities.Account.0 0.82456369
## Securities.Account.1 -0.70373302
## CD.Account.0 1.24328510
## CD.Account.1 -1.08213305
## Online.0 -0.70473971
## Online.1 0.60571708
## CreditCard.0 -0.05180340
## CreditCard.1 0.04687114
preds_train = predict(ridge_model, X_train)
regr.eval(trues =train_dummy_data$Income,preds = preds_train)
## mae mse rmse mape
## 24.2620069 922.4382277 30.3716682 0.5408983
preds_ridge <- predict(ridge_model, X_test)
regr.eval(preds_ridge,test_dummy_data)
## mae mse rmse mape
## 1540.80880 132494.58744 363.99806 21.28001
The alpha value is 1 for lasso regression
in the plots, the numbers at the top signify the number of co-efficients which are not 0.
library(glmnet)
cv_lasso <- cv.glmnet(X_train, y_train, alpha = 1, type.measure = "mse", nfolds = 4)
plot(cv_lasso)
plot(cv_lasso$glmnet.fit, xvar="lambda", label=TRUE)
The object returned form the call to cv.glmnet() function, contains the lambda values of importance
The coefficients are accessible calling the coef() function on the cv_lasso object
lambda - the values of lambda used in the fits.
cvm - The mean cross-validated error - a vector of length length(lambda).
cvsd - estimate of standard error of cvm.
cvup - upper curve = cvm+cvsd.
cvlo - lower curve = cvm-cvsd.
nzero - number of non-zero coefficients at each lambda.
print(cv_lasso$lambda.min)
## [1] 0.1649402
coef(cv_lasso)
## 22 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 115.5261184
## Age -0.1788653
## Experience .
## Family.1 .
## Family.2 2.3842716
## Family.3 -3.2641477
## Family.4 -5.9450225
## CCAvg 21.4414977
## Education.1 14.0703963
## Education.2 .
## Education.3 .
## Mortgage 3.1171028
## Personal.Loan.0 -50.9794016
## Personal.Loan.1 .
## Securities.Account.0 .
## Securities.Account.1 .
## CD.Account.0 .
## CD.Account.1 .
## Online.0 .
## Online.1 .
## CreditCard.0 .
## CreditCard.1 .
cv_ridge <- cv.glmnet(X_train, y_train, alpha = 0, type.measure = "mse", nfolds = 4)
plot(cv_ridge)
plot(cv_ridge$glmnet.fit, xvar="lambda", label=TRUE)
print(cv_ridge$lambda.min)
## [1] 3.314034
coef(cv_ridge)
## 22 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 94.1568118
## Age -0.9456881
## Experience -0.4442522
## Family.1 3.0723120
## Family.2 5.6821293
## Family.3 -3.7913918
## Family.4 -6.0392438
## CCAvg 17.0861459
## Education.1 9.3783578
## Education.2 -5.7008327
## Education.3 -5.3196670
## Mortgage 3.6131927
## Personal.Loan.0 -26.5715485
## Personal.Loan.1 26.4904282
## Securities.Account.0 1.0035395
## Securities.Account.1 -1.0175280
## CD.Account.0 -0.7767520
## CD.Account.1 0.8327770
## Online.0 -0.4241813
## Online.1 0.4187606
## CreditCard.0 0.2546511
## CreditCard.1 -0.2608925
lasso_model <- glmnet(X_train, y_train, lambda = cv_lasso$lambda.min, alpha = 1)
coef(lasso_model)
## 22 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 1.115341e+02
## Age -1.280680e+00
## Experience .
## Family.1 6.703264e+00
## Family.2 9.283041e+00
## Family.3 -1.517758e-01
## Family.4 -2.667231e+00
## CCAvg 2.158039e+01
## Education.1 1.596158e+01
## Education.2 -6.458688e-02
## Education.3 .
## Mortgage 3.933624e+00
## Personal.Loan.0 -5.540454e+01
## Personal.Loan.1 .
## Securities.Account.0 1.276576e+00
## Securities.Account.1 -2.432836e-12
## CD.Account.0 1.315215e+00
## CD.Account.1 -1.038436e-11
## Online.0 -8.767103e-01
## Online.1 .
## CreditCard.0 .
## CreditCard.1 .
cv_lasso$lambda.min
## [1] 0.1649402
preds_lasso <- predict(lasso_model, X_test)
library(DMwR)
preds_train = predict(lasso_model, X_train)
regr.eval(trues =train_dummy_data$Income,preds = preds_train)
## mae mse rmse mape
## 24.2703580 923.2855431 30.3856141 0.5406332
regr.eval(trues = y_test, preds = preds_lasso)
## mae mse rmse mape
## 25.2242720 995.7513498 31.5555280 0.5709309
ridge_model <- glmnet(X_train, y_train, lambda = cv_ridge$lambda.min, alpha = 0)
coef(ridge_model)
## 22 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 94.24893516
## Age -1.53840640
## Experience 0.13665639
## Family.1 3.34495287
## Family.2 5.95414145
## Family.3 -3.95158053
## Family.4 -6.38782030
## CCAvg 20.22347784
## Education.1 10.11130276
## Education.2 -6.02102533
## Education.3 -5.72436923
## Mortgage 3.94331563
## Personal.Loan.0 -28.10605658
## Personal.Loan.1 27.60317479
## Securities.Account.0 0.86320598
## Securities.Account.1 -0.83952187
## CD.Account.0 0.61149182
## CD.Account.1 -0.63340609
## Online.0 -0.59568310
## Online.1 0.58745426
## CreditCard.0 0.04887138
## CreditCard.1 -0.04353616
preds_ridge <- predict(ridge_model, X_test)
library(DMwR)
preds_train = predict(ridge_model,X_train)
regr.eval(trues =train_dummy_data$Income,preds = preds_train)
## mae mse rmse mape
## 24.3540259 925.1754604 30.4166971 0.5498136
regr.eval(trues = y_test, preds = preds_ridge)
## mae mse rmse mape
## 25.2760941 996.1529236 31.5618904 0.5799739
library(glmnet)
# Grid search
sampling_strategy <- trainControl(method = "cv", number = 50)
elastic_net_model <- train(Income ~ .,
train_dummy_data,
method = "glmnet",
trControl = sampling_strategy,
metric = "RMSE",
tuneGrid = expand.grid(.alpha = seq(.005,1,length = 10),.lambda = c((1:20)/10)))
#elastic_net_model$finalModel
plot(elastic_net_model)
preds_train = predict(elastic_net_model, train_dummy_data[, !(names(train_dummy_data) %in% c("Income"))])
regr.eval(trues =train_dummy_data$Income,preds = preds_train)
## mae mse rmse mape
## 24.2465056 922.0774990 30.3657290 0.5395378
preds_elastic <- predict(elastic_net_model, test_dummy_data)
regr.eval(trues = test_dummy_data[, 1], preds = preds_elastic)
## mae mse rmse mape
## 72.44769 6320.38593 79.50085 242.06306